# ==============================================================================
# name: RQ3c-response-instability.R
# date:	Jan 25, 2022
# author: Bernhard Clemm / Tiago Ventura
# purpose: compare professionals / non-professionals in between-wave response stability
# ==============================================================================

rm(list = ls())

source("code/utils/constants.R")
source("code/utils/styles.R")

# READ IN AND FILTER DATA ======================================================

fb <- read.csv("data/analysis_FB.csv")

lu <- read.csv("data/analysis_LU.csv")

yg <- read.csv("data/analysis_YG.csv")

vars_fb <- sub("_1$", "", vars_w1_fb)
waves_fb <- fb %>%
  select(
    dataset, wave, person_id, weight,
    age, female, edu_high, white, party, ideo,
    n_days_active, participated, professional_1, all_of(vars_fb)
  ) %>%
  pivot_wider(
    id_cols = c(person_id, weight, dataset), names_from = wave,
    values_from = c(
      n_days_active, participated, professional_1, all_of(vars_fb),
      age, female, edu_high, white, party, ideo
    ),
    names_glue = "{.value}_{wave}"
  ) %>%
  # some variables are constant across waves - we just included them in pivot bc of NAs in W2
  rename(
    "professional_1" = professional_1_1,
    "age" = age_1, "female" = female_1,
    "edu_high" = edu_high_1, "white" = white_1, "party" = party_1,
    "ideo" = ideo_1
  ) %>%
  # drop rows if person did not participate in wave
  filter(participated_1 == TRUE & participated_2 == TRUE) %>%
  filter(n_days_active_1 >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as.data.frame()

vars_lu <- sub("_w1$", "", vars_w1_lu)
waves_lu <- lu %>%
  select(
    dataset, wave, person_id, weight,
    age, female, edu_high, white, party, ideo,
    n_days_active, participated, professional_1, all_of(vars_lu)
  ) %>%
  pivot_wider(
    id_cols = c(person_id, weight, dataset),
    names_from = wave,
    values_from = c(
      n_days_active, participated, professional_1, all_of(vars_lu),
      age, female, edu_high, white, party, ideo
    ),
    names_glue = "{.value}_w{wave}"
  ) %>%
  # some variables are constant across waves - we just included them in pivot bc of NAs in W2
  rename(
    "professional_1" = professional_1_w1,
    "age" = age_w1, "female" = female_w1,
    "edu_high" = edu_high_w1, "white" = white_w1, "party" = party_w1,
    "ideo" = ideo_w1
  ) %>%
  # drop rows if person did not participate in wave
  filter(participated_w1 == TRUE & participated_w2 == TRUE) %>%
  filter(n_days_active_w1 >= 7) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as.data.frame()

waves_yg <- yg %>%
  select(
    dataset, person_id, weight, n_days_active, professional_1,
    age, female, edu_high, white, party, ideo,
    participated_w1, participated_w2,
    all_of(vars_w1_yg), all_of(vars_w2_yg)
  ) %>%
  filter(n_days_active >= 7) %>%
  # drop rows if person did not participate in wave
  filter(participated_w1 == TRUE & participated_w2 == TRUE) %>%
  mutate(person_id = as.character(person_id)) %>%
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as.data.frame() %>%
  # recode some NA values
  mutate_at(vars(contains("vote_house")), ~ ifelse(.x > 12, NA, .x)) %>%
  mutate_at(vars(contains("ideo_")), ~ ifelse(.x > 100, NA, .x)) %>%
  mutate_at(vars(contains("_self")), ~ ifelse(.x > 100, NA, .x)) %>%
  mutate_at(vars(contains("trump_eval")), ~ ifelse(.x == 8, NA, .x)) %>%
  mutate_at(vars(contains("_views")), ~ ifelse(.x > 100, NA, .x)) %>%
  mutate_at(vars(contains("_post_politics")), ~ ifelse(.x > 7, NA, .x)) %>%
  mutate_at(vars(
    contains("_post_politics"), contains("_national_econ"), contains("personal_finance"),
    contains("group_finance")
  ), ~ ifelse(.x > 7, NA, .x)) %>%
  mutate_at(vars(contains("_imm"), contains("_aca")), ~ ifelse(.x > 10, NA, .x)) %>%
  mutate_at(vars(contains("_knowledge_")), ~ ifelse(.x == 8, NA, .x))

# MAIN PAPER ===================================================================

## Stan model call ####

options(mc.cores = parallel::detectCores()) # run chains in parallel

mod_het <- stan_model("code/RQ3c_heterocedastic_model.stan")

run_bayesian_model <- function(dt, vars_w1, vars_w2, sample, seed = 2025) {
  n_models <- length(vars_w1)

  res <- vector("list", n_models)

  pb <- txtProgressBar(min = 0, max = n_models, style = 3)

  for (i in 1:length(vars_w1)) {
    # Convert strings to symbols
    var_w1 <- sym(vars_w1[[i]])
    var_w2 <- sym(vars_w2[[i]])

    dt_model <- dt %>%
      select(professional_1, !!var_w1, !!var_w2, weight) %>%
      mutate_at(vars(!!var_w1, !!var_w2), ~ as.numeric(scale(.x, center = TRUE))) %>%
      drop_na()

    dt_stan <- list(
      N       = nrow(dt_model),
      t1      = as.vector(dt_model[[2]]), # plain numeric vectors
      pro     = as.integer(dt_model[[1]]),
      t2      = as.vector(dt_model[[3]]),
      weights = as.vector(dt_model[[4]])
    )

    # estimate stan model
    fit <- sampling(
      object = mod_het,
      data = dt_stan,
      chains = 4,
      iter = 1000,
      seed = seed + i,
      refresh = 0
    )

    # Extract posterior samples for sigma_pro and sigma_nonpro
    draws <- extract(fit)

    # Calculate the difference between sigma_pro and sigma_nonpro
    sigma_diff <- draws$sigma_pro - draws$sigma_nonpro
    ci_sigma_diff <- quantile(sigma_diff, probs = c(0.025, 0.5, 0.975))

    res[[i]] <- tibble(
      median = ci_sigma_diff[[2]],
      lo = ci_sigma_diff[[1]],
      up = ci_sigma_diff[[3]],
      mean = mean(sigma_diff),
      sd = sd(sigma_diff),
      zscore = mean / sd,
      variable = str_remove(vars_w1[[i]], "w1_")
    )

    setTxtProgressBar(pb, i)
  }

  close(pb)

  # clean the data
  res_clean <- res %>%
    bind_rows() %>%
    mutate(
      color = case_when(
        median > 0 & lo < 0 ~ "Includes zero",
        median > 0 & lo > 0 ~ "Does not include zero",
        median < 0 & up > 0 ~ "Includes zero",
        median < 0 & up < 0 ~ "Does not include zero"
      ),
      sample = sample
    )

  return(res_clean)
}

bayesian_fb <- run_bayesian_model(
  dt = waves_fb, vars_w1 = vars_w1_fb,
  vars_w2 = vars_w2_fb, sample = "Facebook"
)

bayesian_lu <- run_bayesian_model(
  dt = waves_lu, vars_w1 = vars_w1_lu,
  vars_w2 = vars_w2_lu, sample = "Lucid"
)

bayesian_yg <- run_bayesian_model(
  dt = waves_yg, vars_w1 = vars_w1_yg,
  vars_w2 = vars_w2_yg, sample = "YouGov"
)

bayesian_all <- bind_rows(bayesian_fb, bayesian_lu, bayesian_yg)

bayesian_all <- bayesian_all %>%
  mutate(
    y = zscore,
    significant = ifelse(y > 1.96 | y < -1.96, 1, 0)
  ) %>%
  group_by(sample) %>%
  mutate(pvalue = 1 - mean(significant, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(sample = paste(sample, " \n p-value =", round(pvalue, 2)))

## Figure 4: Z-scores for the difference in standard deviation between professionals and non-professionals ####

bayesian_all %>%
  ggplot(aes(x = y, fill = factor(after_stat(abs(x) > 1.96)), y = 1)) +
  geom_density_ridges_gradient(
    quantile_lines = TRUE, quantiles = c(0.5), alpha = .4,
    jittered_points = TRUE, point_size = 3
  ) +
  labs(y = "Density", x = "z-score for difference in SD between professionals and non-professionals") +
  facet_grid(~sample, scales = "free") +
  theme(
    plot.background = element_rect(fill = "transparent"),
    panel.background = element_rect(fill = "transparent"),
    panel.grid.major = element_blank()
  ) +
  scale_fill_manual(values = c("transparent", "gray75")) +
  guides(fill = FALSE)

ggsave("output/fig4_rq3_zscores_bayesian.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM E.2.1 Bayesian heteroscedastic linear model ==================================

## Figure E.16: Credible intervals for Stan model ####

bayesian_all %>%
  ggplot(
    aes(
      x = median,
      y = variable
    )
  ) +
  labs(x = "Difference in SD", y = "Variable") +
  geom_vline(xintercept = 0, size = 0.25) +
  geom_segment(
    aes(
      x = lo, xend = up,
      y = variable, yend = variable
    ),
    size = 2, color = "grey85"
  ) +
  geom_point(aes(color = color), size = 2) +
  scale_color_manual(values = c("gray70", "darkred"), name = "Credible interval") +
  theme(
    legend.position = "bottom",
    axis.text.y = element_blank(),
    panel.grid.major = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  facet_wrap(~sample, scales = "free", ncol = 3)

ggsave("output/figE16_rq3_differences_bayesian.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

# SM E.2.2 Absolute-difference OLS model ==========================================

run_absdiff_model <- function(
    dt, vars_w1, vars_w2,
    prof_var = "professional_1",
    controls = TRUE,
    weight_var = "weight",
    sample = NA_character_,
    seed = 2025) {
  stopifnot(length(vars_w2) == length(vars_w1))

  # default controls if the user just says controls = TRUE
  default_ctrl <- c("age", "female", "edu_high", "white", "party", "ideo")
  ctrl_vars <- if (isTRUE(controls)) {
    default_ctrl
  } else if (is.character(controls)) {
    controls
  } else {
    character(0)
  }

  run_one <- function(w2, w1) {
    dt_model <- dt %>%
      dplyr::select(dplyr::all_of(c(prof_var, w2, w1, weight_var, ctrl_vars))) %>%
      dplyr::mutate(diff = abs(.data[[w2]] - .data[[w1]])) %>%
      tidyr::drop_na(diff, !!sym(prof_var), !!sym(weight_var))

    # diff ~ treatment [+ controls]
    fml <- reformulate(c(prof_var, ctrl_vars), response = "diff")

    fit <- estimatr::lm_robust(
      fml,
      data    = dt_model,
      weights = dt_model[[weight_var]],
      se_type = "HC2"
    )

    sd_ctrl <- dt_model %>%
      dplyr::filter(.data[[prof_var]] == 0) %>%
      dplyr::pull(diff) %>%
      sd(na.rm = TRUE)

    broom::tidy(fit, conf.int = TRUE) %>%
      dplyr::filter(term == prof_var) %>%
      dplyr::mutate(
        estimate   = estimate / sd_ctrl,
        std.error  = std.error / sd_ctrl,
        conf.low   = conf.low / sd_ctrl,
        conf.high  = conf.high / sd_ctrl,
        outcome    = stringr::str_remove(w1, "w1_"),
        label      = "Heterogeneous Effect",
        color      = ifelse(p.value < .05, "p-Value < 0.05", "p-Value > 0.05"),
        sample     = sample,
        controls   = length(ctrl_vars) > 0
      ) %>%
      dplyr::select(
        term, estimate, std.error, conf.low, conf.high,
        p.value, outcome, label, color, sample, controls
      )
  }

  purrr::map2_dfr(vars_w2, vars_w1, run_one)
}

absdiff_fb <- run_absdiff_model(
  dt = waves_fb, vars_w1 = vars_w1_fb,
  vars_w2 = vars_w2_fb, sample = "Facebook"
)

absdiff_lu <- run_absdiff_model(
  dt = waves_lu, vars_w1 = vars_w1_lu,
  vars_w2 = vars_w2_lu, sample = "Lucid"
)

absdiff_yg <- run_absdiff_model(
  dt = waves_yg, vars_w1 = vars_w1_yg,
  vars_w2 = vars_w2_yg, sample = "YouGov"
)

absdiff_all <- bind_rows(absdiff_fb, absdiff_lu, absdiff_yg)

absdiff_all <- absdiff_all %>%
  mutate(
    y = estimate / std.error,
    significant = ifelse(y > 1.96 | y < -1.96, 1, 0)
  ) %>%
  group_by(sample) %>%
  mutate(pvalue = 1 - mean(significant, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(sample = paste(sample, " \n p-value =", round(pvalue, 2)))

### Figure E.17: Z-scores for absolute-differences model ####

absdiff_all %>%
  ggplot(aes(x = y, fill = factor(after_stat(abs(x) > 1.96)), y = 1)) +
  geom_density_ridges_gradient(
    quantile_lines = TRUE, quantiles = c(0.5), alpha = .4,
    jittered_points = TRUE, point_size = 3
  ) +
  labs(y = "Density", x = "z-score for effects of professionalism on absolute difference") +
  facet_grid(~sample, scales = "free") +
  theme(
    plot.background = element_rect(fill = "transparent"),
    panel.background = element_rect(fill = "transparent"),
    panel.grid.major = element_blank()
  ) +
  scale_fill_manual(values = c("transparent", "gray75")) +
  guides(fill = FALSE)

ggsave("output/figE17_rq3_zscores_absdiff.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)

### Figure E.18: Credible intervals for absolute-differences model####

ggplot(
  absdiff_all %>% filter(!is.na(std.error)),
  aes(
    x = estimate,
    y = outcome
  )
) +
  labs(x = "Marginal Effects of Professionals ", y = "Variable") +
  geom_vline(xintercept = 0, size = 0.25) +
  geom_segment(
    aes(
      x = conf.low, xend = conf.high,
      y = outcome, yend = outcome
    ),
    size = 2, color = "grey85"
  ) +
  geom_point(aes(color = color), size = 2) +
  scale_color_manual(
    values = c("darkred", "gray70"),
    name = ""
  ) +
  theme(
    legend.position = "bottom",
    axis.text.y = element_blank(),
    panel.grid.major = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  facet_wrap(~sample, scales = "free", ncol = 3)

ggsave("output/figE18_rq3_effects_absdiff.pdf",
  width = 12, height = 8, units = "in",
  pointsize = 12, bg = "white"
)
